# Load the stored experiment results and drop the first column
# (presumably a row index written together with the data — TODO confirm).
total_df <- read.csv('total_df.csv')[, -1]
# Show the first rows
head(total_df)
## loss accuracy data unit layer_num activation drop_out_rate batch_size
## 1 0.4254220 0.8823088 train 10 2 sigmoid NA NA
## 2 0.5325518 0.8557114 test 10 2 sigmoid NA NA
## 3 0.5575149 0.9125437 train 10 2 relu NA NA
## 4 0.6650388 0.8567134 test 10 2 relu NA NA
## 5 0.3702507 0.9237881 train 25 2 sigmoid NA NA
## 6 0.4651627 0.8727455 test 25 2 sigmoid NA NA
다양한 실험을 통해서 만들어진 Neural Network Model의 결과를 불러오도록 한다.
# Grid search without dropout: depth x hidden units x activation.
# Every configuration is trained once and contributes two rows (train, test).

# Train one dense softmax classifier and score it on both data splits.
#
# unit_num        units in each hidden layer
# activation_fun  activation of each hidden layer
# layer_num       number of dense layers, counting the output layer
#
# Returns a 2-row tibble: the metrics reported by evaluate() (train row
# first, then test) plus the hyper-parameters of the run.
run_dense_experiment <- function(unit_num, activation_fun, layer_num) {
  # (1) Model setting: the first hidden layer declares the input shape
  model <- keras_model_sequential() %>%
    layer_dense(units = unit_num, activation = activation_fun, input_shape = c(400))
  # Remaining hidden layers; layer_num also counts the first hidden layer and
  # the output layer, hence the "- 2"
  for (i in seq_len(layer_num - 2)) {
    model <- model %>% layer_dense(units = unit_num, activation = activation_fun)
  }
  model <- model %>% layer_dense(units = 10, activation = 'softmax')

  # (2) Training
  model %>% compile(
    loss = 'categorical_crossentropy', optimizer = 'adam', metrics = 'accuracy')
  model %>% fit(train_x, train_y, epochs = 100, validation_split = 0.1)

  # (3) Model output: unlist() so that a list of metrics is handled the same
  # way as a named numeric vector
  train <- unlist(model %>% evaluate(train_x, train_y))
  test <- unlist(model %>% evaluate(test_x, test_y))
  df <- rbind(train, test) %>% as_tibble()
  df$data <- c('train', 'test')
  df$unit <- unit_num
  df$layer_num <- layer_num
  df$activation <- activation_fun
  df
}

# Collect the per-run tibbles in a list and bind once at the end instead of
# growing the data frame with rbind() on every iteration.
results <- list()
# Number of layers including the output layer: 2, 3 and 4
for (n_layers in c(2, 3, 4)) {
  for (unit_num in c(10, 25, 50, 100, 200, 300, 400)) {
    for (activation_fun in c('sigmoid', 'relu')) {
      results[[length(results) + 1]] <-
        run_dense_experiment(unit_num, activation_fun, n_layers)
    }
  }
}
total_df <- bind_rows(results)
# Example network: two 100-unit sigmoid hidden layers and a 10-unit softmax
# output layer; the first layer declares input_shape = c(400).
model <- keras_model_sequential() %>%
  layer_dense(units = 100, activation = 'sigmoid', input_shape = c(400)) %>%
  layer_dense(units = 100, activation = 'sigmoid') %>%
  layer_dense(units = 10, activation = 'softmax')
# Print the layer-by-layer parameter summary
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_2 (Dense) (None, 100) 40100
##
## dense_1 (Dense) (None, 100) 10100
##
## dense (Dense) (None, 10) 1010
##
## ================================================================================
## Total params: 51,210
## Trainable params: 51,210
## Non-trainable params: 0
## ________________________________________________________________________________
## Training
# Compile with categorical cross-entropy and Adam, tracking accuracy
compile(model,
        loss = 'categorical_crossentropy',
        optimizer = 'adam',
        metrics = 'accuracy')
# Train for 200 epochs, holding out 10% of the training data for validation
history <- fit(model, train_x, train_y,
               epochs = 200, validation_split = 0.1)
# Score on the training data
evaluate(model, train_x, train_y)
## loss accuracy
## 0.6812689 0.9335333
# Score on the test data
evaluate(model, test_x, test_y)
## loss accuracy
## 0.7579519 0.9078156
# Plot the training history returned by fit()
plot(history)
첫번째와 두번째 문제에 대해 반복문을 이용하여 각각에 대한 결과들을 한번에 저장해보도록 한다.
위에서 볼 수 있듯, unit의 개수나, activation 함수를 바꿔가면서 그에 따른 모델의 결과를 total_df에 누적하도록 한다.
또한 dense layer를 2, 3, 4개씩으로 설정하여 서로 다른 반복문을 통해 최종적인 결과를 확인해보도록 한다.
# Runs without dropout that used the relu activation; drawn as the second
# line in both plots below.
relu <- total_df %>%
  filter(is.na(drop_out_rate), activation == 'relu')

# Accuracy vs. number of units for the runs without dropout. The sigmoid runs
# are the base data, the relu runs are added as extra layers; facets are
# layer count (rows) x data split (columns).
g1 <- total_df %>%
  filter(is.na(drop_out_rate), activation == 'sigmoid') %>%
  ggplot(aes(x = unit, y = accuracy,
             # hover text picked up by ggplotly(tooltip = 'text')
             text = paste('Accuracy: ', round(accuracy, 3),
                          '<br>Unit: ', unit,
                          '<br>Layer_num: ', layer_num,
                          '<br>Activation: ', activation))) +
  # a constant group joins all points of a facet into a single line
  geom_line(aes(group = 'activation', color = 'sigmoid')) +
  geom_point(color = 'black', size = 1) +
  geom_line(data = relu, aes(group = 'activation', color = 'relu')) +
  geom_point(data = relu, color = 'black', size = 1) +
  facet_grid(layer_num ~ data) +
  scale_color_manual(name = 'Activation Function',
                     values = c('sigmoid' = 'red', 'relu' = 'blue')) +
  ylim(c(0.82, 0.95)) +
  ggtitle('Change # of layer, unit of layer - Accuracy') +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(g1, tooltip = 'text')

# Same layout with the loss on the y axis
g1 <- total_df %>%
  filter(is.na(drop_out_rate), activation == 'sigmoid') %>%
  ggplot(aes(x = unit, y = loss,
             text = paste('Loss: ', round(loss, 3),
                          '<br>Unit: ', unit,
                          '<br>Layer_num: ', layer_num,
                          '<br>Activation: ', activation))) +
  geom_line(aes(group = 'activation', color = 'sigmoid')) +
  geom_point(color = 'black', size = 1) +
  geom_line(data = relu, aes(group = 'activation', color = 'relu')) +
  geom_point(data = relu, color = 'black', size = 1) +
  facet_grid(layer_num ~ data) +
  scale_color_manual(name = 'Activation Function',
                     values = c('sigmoid' = 'red', 'relu' = 'blue')) +
  ylim(c(0.3, 1)) +
  ggtitle('Change # of layer, unit of layer - Loss') +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(g1, tooltip = 'text')
가장 먼저는 Layer의 개수와 Unit의 개수 그리고 Activation을 바꿨던 결과에 대한 결과 그래프이다.
X축은 Unit의 개수이고, Y축은 Accuracy와 Loss 값, 그리고 빨간색은 sigmoid와 파란색은 relu 함수이다.
Accuracy와 Loss를 각각 따로 표현했고, 이에 따른 결과를 확인해보면 다음과 같음을 알 수 있다.
전반적인 결과는 train 데이터에서는 0.9부터 0.93 정도까지, test 데이터에서는 0.9부터 0.92 정도까지로 나온다.
전반적으로 Accuracy의 결과 측면에서는 sigmoid 보다 relu 활성화 함수가 더 정확도가 높다.
Layer의 개수를 늘릴수록, 그리고 Layer 당 Unit의 개수를 늘릴수록 Accuracy 값이 높아지는 것을 볼 수 있다.
# Five most accurate runs without dropout, training split
total_df %>%
  filter(is.na(drop_out_rate), data == 'train') %>%
  select(loss, accuracy, data, unit, layer_num, activation) %>%
  arrange(desc(accuracy)) %>%
  head(5)
## loss accuracy data unit layer_num activation
## 1 0.3651236 0.9445277 train 100 4 sigmoid
## 2 0.7252796 0.9387806 train 300 4 relu
## 3 0.5338585 0.9380310 train 400 4 sigmoid
## 4 0.6699988 0.9377811 train 400 4 relu
## 5 0.3628501 0.9375312 train 50 4 sigmoid
# Five most accurate runs without dropout, test split
total_df %>%
  filter(is.na(drop_out_rate), data == 'test') %>%
  select(loss, accuracy, data, unit, layer_num, activation) %>%
  arrange(desc(accuracy)) %>%
  head(5)
## loss accuracy data unit layer_num activation
## 1 0.7548100 0.9258517 test 300 4 relu
## 2 0.7311823 0.9218437 test 400 4 relu
## 3 0.7815405 0.9158317 test 100 4 relu
## 4 0.9299697 0.9158317 test 200 4 relu
## 5 0.7446094 0.9108216 test 200 2 relu
이번에는 그래프가 아닌 각 결과의 값을 train과 test 각각에 대해 데이터프레임 형태로 그대로 출력해보도록 한다.
위 결과를 통해 앞서 이야기 했던 것처럼 layer의 수가 많고, unit의 개수가 큰 모델들이 비교적 결과가 좋았던 것을 볼 수 있다.
또한 test 데이터에서 성능이 좋았던 활성화 함수는 모두 Relu 였던 것을 확인할 수 있다.
# Grid search with dropout: depth x hidden units x activation x dropout rate.
# Rows already in total_df get drop_out_rate = NA.
total_df$drop_out_rate <- NA

# Train one dense classifier with dropout after every hidden layer and score
# it on both data splits.
#
# unit_num        units in each hidden layer
# activation_fun  activation of each hidden layer
# rate_num        rate passed to every layer_dropout()
# layer_num       number of dense layers, counting the output layer
#
# Returns a 2-row tibble: the metrics reported by evaluate() (train row
# first, then test) plus the hyper-parameters of the run.
run_dropout_experiment <- function(unit_num, activation_fun, rate_num, layer_num) {
  # (1) Model setting: the first hidden layer declares the input shape
  model <- keras_model_sequential() %>%
    layer_dense(units = unit_num, activation = activation_fun, input_shape = c(400)) %>%
    layer_dropout(rate = rate_num)
  # Remaining hidden layers; layer_num also counts the first hidden layer and
  # the output layer, hence the "- 2"
  for (i in seq_len(layer_num - 2)) {
    model <- model %>%
      layer_dense(units = unit_num, activation = activation_fun) %>%
      layer_dropout(rate = rate_num)
  }
  model <- model %>% layer_dense(units = 10, activation = 'softmax')

  # (2) Training
  model %>% compile(
    loss = 'categorical_crossentropy', optimizer = 'adam', metrics = 'accuracy')
  model %>% fit(train_x, train_y, epochs = 100, validation_split = 0.1)

  # (3) Model output: unlist() so that a list of metrics is handled the same
  # way as a named numeric vector
  train <- unlist(model %>% evaluate(train_x, train_y))
  test <- unlist(model %>% evaluate(test_x, test_y))
  df <- rbind(train, test) %>% as_tibble()
  df$data <- c('train', 'test')
  df$unit <- unit_num
  df$drop_out_rate <- rate_num
  df$layer_num <- layer_num
  df$activation <- activation_fun
  df
}

# Collect the per-run tibbles in a list and bind once at the end instead of
# growing total_df with rbind() on every iteration.
results <- list()
# Number of layers including the output layer: 2, 3 and 4
for (n_layers in c(2, 3, 4)) {
  for (unit_num in c(25, 50, 100, 200, 300, 400)) {
    for (activation_fun in c('sigmoid', 'relu')) {
      for (rate_num in c(0.1, 0.2, 0.3)) {
        results[[length(results) + 1]] <-
          run_dropout_experiment(unit_num, activation_fun, rate_num, n_layers)
      }
    }
  }
}
# Existing rows first, new rows appended; columns are matched by name
total_df <- bind_rows(c(list(total_df), results))
# Example network with dropout: three 100-unit sigmoid hidden layers, each
# followed by a dropout layer with rate 0.3, and a 10-unit softmax output.
model <- keras_model_sequential() %>%
  layer_dense(units = 100, activation = 'sigmoid', input_shape = c(400)) %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 100, activation = 'sigmoid') %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 100, activation = 'sigmoid') %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 10, activation = 'softmax')
# Print the layer-by-layer parameter summary
summary(model)
## Model: "sequential_1"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_6 (Dense) (None, 100) 40100
##
## dropout_2 (Dropout) (None, 100) 0
##
## dense_5 (Dense) (None, 100) 10100
##
## dropout_1 (Dropout) (None, 100) 0
##
## dense_4 (Dense) (None, 100) 10100
##
## dropout (Dropout) (None, 100) 0
##
## dense_3 (Dense) (None, 10) 1010
##
## ================================================================================
## Total params: 61,310
## Trainable params: 61,310
## Non-trainable params: 0
## ________________________________________________________________________________
## Training
# Compile with categorical cross-entropy and Adam, tracking accuracy
compile(model,
        loss = 'categorical_crossentropy',
        optimizer = 'adam',
        metrics = 'accuracy')
# Train for 200 epochs, holding out 10% of the training data for validation
history <- fit(model, train_x, train_y,
               epochs = 200, validation_split = 0.1)
# Score on the training data
evaluate(model, train_x, train_y)
## loss accuracy
## 0.5113497 0.9345328
# Score on the test data
evaluate(model, test_x, test_y)
## loss accuracy
## 0.6352201 0.8957916
# Plot the training history returned by fit()
plot(history)
이번에는 기존에 1번과 2번에서 했던 unit의 개수와 layer의 개수, activation 함수를 바꿨던 조건에 drop out을 추가해본다.
모델이 학습하는 경우에서 기존과 동일하게 unit_num과 activation_fun 그리고 rate_num을 추가해주도록 한다.
이를 통해서 각 unit의 개수와 activation 함수 그리고 rate_num에 따른 결과가 생성된다.
이를 다시 한번 plotly 패키지를 활용하여 최종 결과를 그려보도록 한다.
# Accuracy vs. number of units for the sigmoid runs that used dropout but no
# explicit batch size. One coloured line per dropout rate; facets are
# layer count (rows) x data split (columns).
# Rows with a missing drop_out_rate are dropped by the equality test itself.
d_o_2 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.2, activation == 'sigmoid')
d_o_3 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.3, activation == 'sigmoid')
g1 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.1, activation == 'sigmoid') %>%
  ggplot(aes(x = unit, y = accuracy,
             # hover text picked up by ggplotly(tooltip = 'text'); the dropout
             # rate is included because it is what distinguishes the lines
             text = paste('Accuracy: ', round(accuracy, 3),
                          '<br>Unit: ', unit,
                          '<br>Layer_num: ', layer_num,
                          '<br>Activation: ', activation,
                          '<br>Drop_out_rate: ', drop_out_rate))) +
  # a constant group joins all points of a facet into a single line
  geom_line(aes(group = 'drop_out_rate', color = 'drop_out_rate=0.1')) +
  geom_point(color = 'black', size = 1) +
  geom_line(data = d_o_2, aes(group = 'drop_out_rate', color = 'drop_out_rate=0.2')) +
  geom_point(data = d_o_2, color = 'black', size = 1) +
  geom_line(data = d_o_3, aes(group = 'drop_out_rate', color = 'drop_out_rate=0.3')) +
  geom_point(data = d_o_3, color = 'black', size = 1) +
  facet_grid(layer_num ~ data) +
  # the colours encode the dropout rate, so the legend is titled accordingly
  scale_color_manual(name = 'Drop Out Rate',
                     values = c('drop_out_rate=0.1' = 'red',
                                'drop_out_rate=0.2' = 'blue',
                                'drop_out_rate=0.3' = 'green')) +
  ylim(c(0.85, 0.95)) +
  ggtitle('Change # of layer, unit of layer, drop out rate - sigmoid (Accuracy)') +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(g1, tooltip = 'text')
# Accuracy vs. number of units for the relu runs that used dropout but no
# explicit batch size. One coloured line per dropout rate; facets are
# layer count (rows) x data split (columns).
# Rows with a missing drop_out_rate are dropped by the equality test itself.
d_o_2 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.2, activation == 'relu')
d_o_3 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.3, activation == 'relu')
g1 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.1, activation == 'relu') %>%
  ggplot(aes(x = unit, y = accuracy,
             # hover text picked up by ggplotly(tooltip = 'text'); the dropout
             # rate is included because it is what distinguishes the lines
             text = paste('Accuracy: ', round(accuracy, 3),
                          '<br>Unit: ', unit,
                          '<br>Layer_num: ', layer_num,
                          '<br>Activation: ', activation,
                          '<br>Drop_out_rate: ', drop_out_rate))) +
  # a constant group joins all points of a facet into a single line
  geom_line(aes(group = 'drop_out_rate', color = 'drop_out_rate=0.1')) +
  geom_point(color = 'black', size = 1) +
  geom_line(data = d_o_2, aes(group = 'drop_out_rate', color = 'drop_out_rate=0.2')) +
  geom_point(data = d_o_2, color = 'black', size = 1) +
  geom_line(data = d_o_3, aes(group = 'drop_out_rate', color = 'drop_out_rate=0.3')) +
  geom_point(data = d_o_3, color = 'black', size = 1) +
  facet_grid(layer_num ~ data) +
  # the colours encode the dropout rate, so the legend is titled accordingly
  scale_color_manual(name = 'Drop Out Rate',
                     values = c('drop_out_rate=0.1' = 'red',
                                'drop_out_rate=0.2' = 'blue',
                                'drop_out_rate=0.3' = 'green')) +
  ylim(c(0.85, 0.95)) +
  ggtitle('Change # of layer, unit of layer, drop out rate - relu (Accuracy)') +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(g1, tooltip = 'text')
# Loss vs. number of units for the sigmoid runs that used dropout but no
# explicit batch size. One coloured line per dropout rate; facets are
# layer count (rows) x data split (columns).
# Rows with a missing drop_out_rate are dropped by the equality test itself.
d_o_2 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.2, activation == 'sigmoid')
d_o_3 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.3, activation == 'sigmoid')
g1 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.1, activation == 'sigmoid') %>%
  ggplot(aes(x = unit, y = loss,
             # hover text picked up by ggplotly(tooltip = 'text'); the dropout
             # rate is included because it is what distinguishes the lines
             text = paste('Loss: ', round(loss, 3),
                          '<br>Unit: ', unit,
                          '<br>Layer_num: ', layer_num,
                          '<br>Activation: ', activation,
                          '<br>Drop_out_rate: ', drop_out_rate))) +
  # a constant group joins all points of a facet into a single line
  geom_line(aes(group = 'drop_out_rate', color = 'drop_out_rate=0.1')) +
  geom_point(color = 'black', size = 1) +
  geom_line(data = d_o_2, aes(group = 'drop_out_rate', color = 'drop_out_rate=0.2')) +
  geom_point(data = d_o_2, color = 'black', size = 1) +
  geom_line(data = d_o_3, aes(group = 'drop_out_rate', color = 'drop_out_rate=0.3')) +
  geom_point(data = d_o_3, color = 'black', size = 1) +
  facet_grid(layer_num ~ data) +
  # the colours encode the dropout rate, so the legend is titled accordingly
  scale_color_manual(name = 'Drop Out Rate',
                     values = c('drop_out_rate=0.1' = 'red',
                                'drop_out_rate=0.2' = 'blue',
                                'drop_out_rate=0.3' = 'green')) +
  ylim(c(0.3, 1)) +
  ggtitle('Change # of layer, unit of layer, drop out rate - sigmoid (Loss)') +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(g1, tooltip = 'text')
# Loss vs. number of units for the relu runs that used dropout but no
# explicit batch size. One coloured line per dropout rate; facets are
# layer count (rows) x data split (columns).
# Rows with a missing drop_out_rate are dropped by the equality test itself.
d_o_2 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.2, activation == 'relu')
d_o_3 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.3, activation == 'relu')
g1 <- total_df %>%
  filter(is.na(batch_size), drop_out_rate == 0.1, activation == 'relu') %>%
  ggplot(aes(x = unit, y = loss,
             # hover text picked up by ggplotly(tooltip = 'text'); the dropout
             # rate is included because it is what distinguishes the lines
             text = paste('Loss: ', round(loss, 3),
                          '<br>Unit: ', unit,
                          '<br>Layer_num: ', layer_num,
                          '<br>Activation: ', activation,
                          '<br>Drop_out_rate: ', drop_out_rate))) +
  # a constant group joins all points of a facet into a single line
  geom_line(aes(group = 'drop_out_rate', color = 'drop_out_rate=0.1')) +
  geom_point(color = 'black', size = 1) +
  geom_line(data = d_o_2, aes(group = 'drop_out_rate', color = 'drop_out_rate=0.2')) +
  geom_point(data = d_o_2, color = 'black', size = 1) +
  geom_line(data = d_o_3, aes(group = 'drop_out_rate', color = 'drop_out_rate=0.3')) +
  geom_point(data = d_o_3, color = 'black', size = 1) +
  facet_grid(layer_num ~ data) +
  # the colours encode the dropout rate, so the legend is titled accordingly
  scale_color_manual(name = 'Drop Out Rate',
                     values = c('drop_out_rate=0.1' = 'red',
                                'drop_out_rate=0.2' = 'blue',
                                'drop_out_rate=0.3' = 'green')) +
  ylim(c(0.3, 1)) +
  ggtitle('Change # of layer, unit of layer, drop out rate - relu (Loss)') +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(g1, tooltip = 'text')
이번에는 앞서 했던 Layer 수와 Unit 수, Activation 함수를 비롯하여 drop out을 추가하여 rate를 변경한 결과를 확인한다.
일반적으로 drop out은 딥러닝 모델의 과적합을 방지하기 위해 사용되는 것으로 알려져 있다.
그래서 hidden layer에서 주어진 확률 정도로 유닛들을 제거하는 역할을 한다.
이 경우에 가장 먼저 train과 test를 비교할 경우, 그 차이가 약간은 도드라진다는 것을 확인할 수 있다.
train 데이터에서는 평균적으로 Accuracy가 0.92 정도되고, test 데이터에서는 0.9 정도 되는 것을 확인할 수 있다.
또한 drop out을 추가하고, layer의 개수와 unit의 개수를 조절할수록 성능이 개선되는 것을 확인할 수 있다.
# Five most accurate runs with dropout and no explicit batch size,
# training split
total_df %>%
  filter(!is.na(drop_out_rate), is.na(batch_size), data == 'train') %>%
  select(loss, accuracy, data, unit, layer_num, activation, drop_out_rate) %>%
  arrange(desc(accuracy)) %>%
  head(5)
## loss accuracy data unit layer_num activation drop_out_rate
## 1 0.5913615 0.9472764 train 400 4 relu 0.3
## 2 0.6173096 0.9457771 train 400 3 relu 0.1
## 3 0.4044213 0.9450275 train 200 4 sigmoid 0.1
## 4 0.4746284 0.9440280 train 200 4 relu 0.3
## 5 0.3839283 0.9427786 train 200 4 sigmoid 0.2
# Five most accurate runs with dropout and no explicit batch size, test split
total_df %>%
  filter(!is.na(drop_out_rate), is.na(batch_size), data == 'test') %>%
  select(loss, accuracy, data, unit, layer_num, activation, drop_out_rate) %>%
  arrange(desc(accuracy)) %>%
  head(5)
## loss accuracy data unit layer_num activation drop_out_rate
## 1 0.6407647 0.9268537 test 400 3 relu 0.1
## 2 0.4889172 0.9238477 test 200 4 relu 0.3
## 3 0.6828322 0.9238477 test 400 4 relu 0.3
## 4 0.8664092 0.9188377 test 400 3 relu 0.3
## 5 0.6992087 0.9178357 test 200 3 relu 0.2
이번에도 단순히 그래프 말고, 데이터프레임 형태로 결과를 출력해보도록 한다.
결과를 통해 알 수 있듯, 대다수의 상위 accuracy 결과는 drop out이 포함되어 있다는 것을 확인할 수 있다.
test 데이터에서도 역시 unit의 개수와 layer 수가 많으면서, drop out이 포함된 케이스가 accuracy가 가장 높았음을 알 수 있다.
# Grid search with dropout and an explicit batch size:
# depth x hidden units x activation x dropout rate x batch size.
# Rows already in total_df get batch_size = NA.
total_df$batch_size <- NA

# Train one dense classifier with dropout after every hidden layer, using the
# given batch size, and score it on both data splits.
#
# unit_num        units in each hidden layer
# activation_fun  activation of each hidden layer
# rate_num        rate passed to every layer_dropout()
# batch_num       batch_size passed to fit()
# layer_num       number of dense layers, counting the output layer
#
# Returns a 2-row tibble: the metrics reported by evaluate() (train row
# first, then test) plus the hyper-parameters of the run.
run_batch_experiment <- function(unit_num, activation_fun, rate_num, batch_num,
                                 layer_num) {
  # (1) Model setting: the first hidden layer declares the input shape
  model <- keras_model_sequential() %>%
    layer_dense(units = unit_num, activation = activation_fun, input_shape = c(400)) %>%
    layer_dropout(rate = rate_num)
  # Remaining hidden layers; layer_num also counts the first hidden layer and
  # the output layer, hence the "- 2"
  for (i in seq_len(layer_num - 2)) {
    model <- model %>%
      layer_dense(units = unit_num, activation = activation_fun) %>%
      layer_dropout(rate = rate_num)
  }
  model <- model %>% layer_dense(units = 10, activation = 'softmax')

  # (2) Training
  model %>% compile(
    loss = 'categorical_crossentropy', optimizer = 'adam', metrics = 'accuracy')
  model %>% fit(
    train_x, train_y, epochs = 100, batch_size = batch_num, validation_split = 0.1)

  # (3) Model output: unlist() so that a list of metrics is handled the same
  # way as a named numeric vector
  train <- unlist(model %>% evaluate(train_x, train_y))
  test <- unlist(model %>% evaluate(test_x, test_y))
  df <- rbind(train, test) %>% as_tibble()
  df$data <- c('train', 'test')
  df$unit <- unit_num
  df$drop_out_rate <- rate_num
  df$batch_size <- batch_num
  df$layer_num <- layer_num
  df$activation <- activation_fun
  df
}

# Collect the per-run tibbles in a list and bind once at the end instead of
# growing total_df with rbind() on every iteration.
results <- list()
# Number of layers including the output layer: 2, 3 and 4
for (n_layers in c(2, 3, 4)) {
  for (unit_num in c(25, 50, 100, 200, 300, 400)) {
    for (activation_fun in c('sigmoid', 'relu')) {
      for (rate_num in c(0.1, 0.2, 0.3)) {
        for (batch_num in c(32, 64, 128, 256)) {
          results[[length(results) + 1]] <- run_batch_experiment(
            unit_num, activation_fun, rate_num, batch_num, n_layers)
        }
      }
    }
  }
}
# Existing rows first, new rows appended; columns are matched by name
total_df <- bind_rows(c(list(total_df), results))
# Example network for the batch-size run: three 300-unit sigmoid hidden
# layers, each followed by a dropout layer with rate 0.3, and a 10-unit
# softmax output.
model <- keras_model_sequential() %>%
  layer_dense(units = 300, activation = 'sigmoid', input_shape = c(400)) %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 300, activation = 'sigmoid') %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 300, activation = 'sigmoid') %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 10, activation = 'softmax')
# Print the layer-by-layer parameter summary
summary(model)
## Model: "sequential_2"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_10 (Dense) (None, 300) 120300
##
## dropout_5 (Dropout) (None, 300) 0
##
## dense_9 (Dense) (None, 300) 90300
##
## dropout_4 (Dropout) (None, 300) 0
##
## dense_8 (Dense) (None, 300) 90300
##
## dropout_3 (Dropout) (None, 300) 0
##
## dense_7 (Dense) (None, 10) 3010
##
## ================================================================================
## Total params: 303,910
## Trainable params: 303,910
## Non-trainable params: 0
## ________________________________________________________________________________
## Training
# Compile with categorical cross-entropy and Adam, tracking accuracy
compile(model,
        loss = 'categorical_crossentropy',
        optimizer = 'adam',
        metrics = 'accuracy')
# Train for 200 epochs with batches of 128, holding out 10% of the training
# data for validation
history <- fit(model, train_x, train_y,
               epochs = 200, validation_split = 0.1, batch_size = 128)
# Score on the training data
evaluate(model, train_x, train_y)
## loss accuracy
## 0.4168667 0.9400300
# Score on the test data
evaluate(model, test_x, test_y)
## loss accuracy
## 0.5265291 0.9148297
# Plot the training history returned by fit()
plot(history)
가장 마지막으로는 위에서 했던 layer 수 변경, units 수 변경, drop out 추가를 비롯하여 batch size를 설정한 것을 추가하도록 한다.
기존의 파라미터에 batch size를 추가한 결과를 덧붙여서 모델의 각 결과들을 저장하도록 한다.
그렇다면 이번 경우도 plot을 통해 어떻게 나오는지 확인해보도록 한다.
# Accuracy vs. number of units for the sigmoid runs trained with dropout and
# an explicit batch size. One colour per batch size; facets are
# layer count (rows) x data split (columns).
# Rows with a missing batch_size are dropped by the equality test itself.
# NOTE(review): the subsets are not restricted to one dropout rate, so a line
# may pass through several points per unit value — confirm this is intended.
batch_64 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 64, activation == 'sigmoid')
batch_128 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 128, activation == 'sigmoid')
batch_256 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 256, activation == 'sigmoid')
g1 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 32, activation == 'sigmoid') %>%
  ggplot(aes(x = unit, y = accuracy,
             # hover text picked up by ggplotly(tooltip = 'text')
             text = paste('Accuracy: ', round(accuracy, 3),
                          '<br>Unit: ', unit,
                          '<br>Layer_num: ', layer_num,
                          '<br>Activation: ', activation,
                          '<br>Drop_out_rate:', drop_out_rate,
                          '<br>Batch_size: ', batch_size))) +
  # a constant group joins all points of a facet into a single line
  geom_line(aes(group = 'batch_size', color = 'batch_size = 32')) +
  geom_point(color = 'black', size = 1) +
  # each point layer uses the same subset as the line drawn just before it
  # (previously the dropout-only subsets d_o_2 / d_o_3 were plotted here)
  geom_line(data = batch_64, aes(group = 'batch_size', color = 'batch_size = 64')) +
  geom_point(data = batch_64, color = 'black', size = 1) +
  geom_line(data = batch_128, aes(group = 'batch_size', color = 'batch_size = 128')) +
  geom_point(data = batch_128, color = 'black', size = 1) +
  geom_line(data = batch_256, aes(group = 'batch_size', color = 'batch_size = 256')) +
  geom_point(data = batch_256, color = 'black', size = 1) +
  facet_grid(layer_num ~ data) +
  # the colours encode the batch size, so the legend is titled accordingly
  scale_color_manual(name = 'Batch Size',
                     values = c('batch_size = 32' = 'yellow',
                                'batch_size = 64' = 'red',
                                'batch_size = 128' = 'blue',
                                'batch_size = 256' = 'green')) +
  ylim(c(0.85, 0.95)) +
  ggtitle('Changing # of layer, unit of layer, drop out rate, batch size - sigmoid (Accuracy)') +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(g1, tooltip = 'text')
# Accuracy vs. number of units for the relu runs trained with dropout and an
# explicit batch size. One colour per batch size; facets are
# layer count (rows) x data split (columns).
# Rows with a missing batch_size are dropped by the equality test itself.
# NOTE(review): the subsets are not restricted to one dropout rate, so a line
# may pass through several points per unit value — confirm this is intended.
batch_64 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 64, activation == 'relu')
batch_128 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 128, activation == 'relu')
batch_256 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 256, activation == 'relu')
g1 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 32, activation == 'relu') %>%
  ggplot(aes(x = unit, y = accuracy,
             # hover text picked up by ggplotly(tooltip = 'text')
             text = paste('Accuracy: ', round(accuracy, 3),
                          '<br>Unit: ', unit,
                          '<br>Layer_num: ', layer_num,
                          '<br>Activation: ', activation,
                          '<br>Drop_out_rate:', drop_out_rate,
                          '<br>Batch_size: ', batch_size))) +
  # a constant group joins all points of a facet into a single line
  geom_line(aes(group = 'batch_size', color = 'batch_size = 32')) +
  geom_point(color = 'black', size = 1) +
  # each point layer uses the same subset as the line drawn just before it
  # (previously the dropout-only subsets d_o_2 / d_o_3 were plotted here)
  geom_line(data = batch_64, aes(group = 'batch_size', color = 'batch_size = 64')) +
  geom_point(data = batch_64, color = 'black', size = 1) +
  geom_line(data = batch_128, aes(group = 'batch_size', color = 'batch_size = 128')) +
  geom_point(data = batch_128, color = 'black', size = 1) +
  geom_line(data = batch_256, aes(group = 'batch_size', color = 'batch_size = 256')) +
  geom_point(data = batch_256, color = 'black', size = 1) +
  facet_grid(layer_num ~ data) +
  # the colours encode the batch size, so the legend is titled accordingly
  scale_color_manual(name = 'Batch Size',
                     values = c('batch_size = 32' = 'yellow',
                                'batch_size = 64' = 'red',
                                'batch_size = 128' = 'blue',
                                'batch_size = 256' = 'green')) +
  ylim(c(0.85, 0.95)) +
  ggtitle('Changing # of layer, unit of layer, drop out rate, batch size - relu (Accuracy)') +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(g1, tooltip = 'text')
# Loss vs. number of units for the sigmoid runs trained with dropout and an
# explicit batch size. One colour per batch size; facets are
# layer count (rows) x data split (columns).
# Rows with a missing batch_size are dropped by the equality test itself.
# NOTE(review): the subsets are not restricted to one dropout rate, so a line
# may pass through several points per unit value — confirm this is intended.
batch_64 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 64, activation == 'sigmoid')
batch_128 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 128, activation == 'sigmoid')
batch_256 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 256, activation == 'sigmoid')
g1 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 32, activation == 'sigmoid') %>%
  ggplot(aes(x = unit, y = loss,
             # hover text picked up by ggplotly(tooltip = 'text')
             text = paste('Loss: ', round(loss, 3),
                          '<br>Unit: ', unit,
                          '<br>Layer_num: ', layer_num,
                          '<br>Activation: ', activation,
                          '<br>Drop_out_rate:', drop_out_rate,
                          '<br>Batch_size: ', batch_size))) +
  # a constant group joins all points of a facet into a single line
  geom_line(aes(group = 'batch_size', color = 'batch_size = 32')) +
  geom_point(color = 'black', size = 1) +
  # each point layer uses the same subset as the line drawn just before it
  # (previously the dropout-only subsets d_o_2 / d_o_3 were plotted here)
  geom_line(data = batch_64, aes(group = 'batch_size', color = 'batch_size = 64')) +
  geom_point(data = batch_64, color = 'black', size = 1) +
  geom_line(data = batch_128, aes(group = 'batch_size', color = 'batch_size = 128')) +
  geom_point(data = batch_128, color = 'black', size = 1) +
  geom_line(data = batch_256, aes(group = 'batch_size', color = 'batch_size = 256')) +
  geom_point(data = batch_256, color = 'black', size = 1) +
  facet_grid(layer_num ~ data) +
  # the colours encode the batch size, so the legend is titled accordingly
  scale_color_manual(name = 'Batch Size',
                     values = c('batch_size = 32' = 'yellow',
                                'batch_size = 64' = 'red',
                                'batch_size = 128' = 'blue',
                                'batch_size = 256' = 'green')) +
  ylim(c(0.3, 1)) +
  ggtitle('Changing # of layer, unit of layer, drop out rate, batch size - sigmoid (Loss)') +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(g1, tooltip = 'text')
# Loss vs. number of units for the relu runs trained with dropout and an
# explicit batch size. One colour per batch size; facets are
# layer count (rows) x data split (columns).
# Rows with a missing batch_size are dropped by the equality test itself.
# NOTE(review): the subsets are not restricted to one dropout rate, so a line
# may pass through several points per unit value — confirm this is intended.
batch_64 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 64, activation == 'relu')
batch_128 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 128, activation == 'relu')
batch_256 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 256, activation == 'relu')
g1 <- total_df %>%
  filter(!is.na(drop_out_rate), batch_size == 32, activation == 'relu') %>%
  ggplot(aes(x = unit, y = loss,
             # hover text picked up by ggplotly(tooltip = 'text')
             text = paste('Loss: ', round(loss, 3),
                          '<br>Unit: ', unit,
                          '<br>Layer_num: ', layer_num,
                          '<br>Activation: ', activation,
                          '<br>Drop_out_rate:', drop_out_rate,
                          '<br>Batch_size: ', batch_size))) +
  # a constant group joins all points of a facet into a single line
  geom_line(aes(group = 'batch_size', color = 'batch_size = 32')) +
  geom_point(color = 'black', size = 1) +
  # each point layer uses the same subset as the line drawn just before it
  # (previously the dropout-only subsets d_o_2 / d_o_3 were plotted here)
  geom_line(data = batch_64, aes(group = 'batch_size', color = 'batch_size = 64')) +
  geom_point(data = batch_64, color = 'black', size = 1) +
  geom_line(data = batch_128, aes(group = 'batch_size', color = 'batch_size = 128')) +
  geom_point(data = batch_128, color = 'black', size = 1) +
  geom_line(data = batch_256, aes(group = 'batch_size', color = 'batch_size = 256')) +
  geom_point(data = batch_256, color = 'black', size = 1) +
  facet_grid(layer_num ~ data) +
  # the colours encode the batch size, so the legend is titled accordingly
  scale_color_manual(name = 'Batch Size',
                     values = c('batch_size = 32' = 'yellow',
                                'batch_size = 64' = 'red',
                                'batch_size = 128' = 'blue',
                                'batch_size = 256' = 'green')) +
  ylim(c(0.3, 1)) +
  ggtitle('Changing # of layer, unit of layer, drop out rate, batch size - relu (Loss)') +
  theme(plot.title = element_text(hjust = 0.5))
ggplotly(g1, tooltip = 'text')
가장 마지막으로는 Layer 수, Unit 수, Activation 함수, drop out을 비롯하여 Batch size를 조절해본다.
학습이 진행될 때 한 번에 읽어 들이는 데이터의 양을 의미하는 배치 사이즈는 학습 속도에도 영향을 준다.
결과를 확인해보면, 전반적으로 Batch size가 128이나 256이 넘어가게 되면, 오히려 Accuracy 값이 감소하는 것을 볼 수 있다.
그래프에 내용이 너무 많아서 실제 결과를 데이터프레임 형식으로 출력해보도록 한다.
# Five most accurate runs with dropout and an explicit batch size,
# training split
total_df %>%
  filter(!is.na(drop_out_rate), !is.na(batch_size), data == 'train') %>%
  select(loss, accuracy, data, unit, layer_num, activation, drop_out_rate, batch_size) %>%
  arrange(desc(accuracy)) %>%
  head(5)
## loss accuracy data unit layer_num activation drop_out_rate batch_size
## 1 0.4044045 0.9492754 train 200 4 relu 0.2 64
## 2 0.3652179 0.9447776 train 50 4 relu 0.1 32
## 3 0.5846784 0.9437781 train 400 4 relu 0.2 64
## 4 0.3764162 0.9427786 train 400 4 relu 0.1 128
## 5 0.5307893 0.9425287 train 300 3 relu 0.3 32
# Five most accurate runs with dropout and an explicit batch size, test split
total_df %>%
  filter(!is.na(drop_out_rate), !is.na(batch_size), data == 'test') %>%
  select(loss, accuracy, data, unit, layer_num, activation, drop_out_rate, batch_size) %>%
  arrange(desc(accuracy)) %>%
  head(5)
## loss accuracy data unit layer_num activation drop_out_rate batch_size
## 1 0.4820182 0.9318637 test 200 4 relu 0.2 64
## 2 0.5813320 0.9288577 test 300 3 relu 0.3 32
## 3 0.4165447 0.9288577 test 300 4 relu 0.1 128
## 4 0.6412730 0.9258517 test 400 4 relu 0.2 64
## 5 0.4145692 0.9248497 test 400 4 relu 0.1 128
그래프 상에서는 직관적으로 확인하기가 조금 어려웠는데, 그래서 데이터를 직접 보면서 결과를 확인해본다.
앞서 이야기 했듯, 너무 큰 배치 사이즈보다는 32나 64 정도의 값에서 가장 좋은 성능을 보이는 것을 확인할 수 있다.
이 파라미터 역시 적절히 unit과 layer 수가 있는 경우에 결과가 잘 나오는 것을 알 수 있다.
결국 unit과 layer 수, drop out, batch size를 모두 적절히 준 모델의 결과가 가장 최적이라는 것을 알 수 있었다.
# Interactive plot of loss (x) against accuracy (y) for every row of
# total_df, coloured by data split; the hover text lists the
# hyper-parameters of the run.
pal <- c('red', 'blue')
plot_ly(
  total_df,
  x = ~loss,
  y = ~accuracy,
  color = ~data,
  colors = pal,
  text = ~paste('Unit: ', unit,
                '<br>Layer_num: ', layer_num,
                '<br>Activation: ', activation,
                '<br>Drop_out_rate:', drop_out_rate,
                '<br>Batch_size: ', batch_size)
)
최종적으로 모든 데이터에 대하여 plotly를 활용하여 그래프를 만들어보도록 한다.
X축을 loss 값으로, Y축을 accuracy로 하여 좌상단에 있는 점들이 loss는 낮으면서 accuracy는 높은 이상적인 결과라고 할 수 있다.
전반적으로 train 데이터에 비해 test 데이터의 성능이 조금 떨어지는 것을 확인할 수 있다.
plotly 결과로 반응형을 통해 각 파라미터를 확인할 수 있다.
# Five most accurate training-split rows across all experiments
total_df %>%
  filter(data == 'train') %>%
  select(
    loss, accuracy, data, unit, layer_num,
    activation, drop_out_rate, batch_size
  ) %>%
  arrange(desc(accuracy)) %>%
  head(5)
## loss accuracy data unit layer_num activation drop_out_rate batch_size
## 1 0.4044045 0.9492754 train 200 4 relu 0.2 64
## 2 0.5913615 0.9472764 train 400 4 relu 0.3 NA
## 3 0.6173096 0.9457771 train 400 3 relu 0.1 NA
## 4 0.4044213 0.9450275 train 200 4 sigmoid 0.1 NA
## 5 0.3652179 0.9447776 train 50 4 relu 0.1 32
# Five most accurate test-split rows among the runs that have both a dropout
# rate and a batch size.
# NOTE(review): the training-split query just above filters on data only,
# while this one also requires drop_out_rate and batch_size — confirm whether
# the restriction is intended.
total_df %>%
  filter(!is.na(drop_out_rate), !is.na(batch_size), data == 'test') %>%
  select(
    loss, accuracy, data, unit, layer_num,
    activation, drop_out_rate, batch_size
  ) %>%
  arrange(desc(accuracy)) %>%
  head(5)
## loss accuracy data unit layer_num activation drop_out_rate batch_size
## 1 0.4820182 0.9318637 test 200 4 relu 0.2 64
## 2 0.5813320 0.9288577 test 300 3 relu 0.3 32
## 3 0.4165447 0.9288577 test 300 4 relu 0.1 128
## 4 0.6412730 0.9258517 test 400 4 relu 0.2 64
## 5 0.4145692 0.9248497 test 400 4 relu 0.1 128
전체 테스트 결과에 대해 가장 좋았던 파라미터는 무엇인지 가장 마지막으로 확인해보도록 한다.
train 데이터의 경우에는 배치 사이즈를 64로 설정하고, unit과 layer 수가 적절히 크고, drop out이 있는 경우에 가장 최적이었다.
train에서의 accuracy는 약 0.949 정도가 나왔던 것을 확인할 수 있다.
한편, test 데이터의 경우에는 앞서 train 데이터의 조건과 동일한 경우에 가장 성능이 좋았던 것을 볼 수 있었다.
unit은 200, layer 수는 4개, activation 함수는 relu, drop out은 0.2 그리고 batch size를 64로 한 경우가 최적이었다.
이때의 train의 accuracy는 0.949, test의 accuracy는 0.932 정도로 그 차이가 크지 않아서 과적합이라고 하기도 어려울 것이다.
이와 같은 다양한 조건들을 통해 파라미터를 조정하면서 최적의 성능을 갖는 조건은 무엇인지 확인해볼 수 있었다.